/*
Purpose: crosswalk different sources of occupation codes (SOC and O-NET) and
add math level variable to ACS data
*/

import excel "data/raw/2019_to_SOC_Crosswalk.xlsx", clear ///
firstrow cellrange(A4:D1020)

// getting Excel crosswalk files into Stata

ren SOCCode soc_code2018
ren ONETSOC2019Code onet_code

keep soc_code2018 onet_code

save "data/clean/soc_xwalk_2018.dta", replace

import excel "data/raw/soc_2010_to_2018_crosswalk.xlsx", clear ///
firstrow cellrange(A9:D909)

ren SOCCode soc_code2010
ren C soc_code2018

keep soc_code2010 soc_code2018

save "data/clean/soc_xwalk_2010.dta", replace

import excel "data/raw/soc_2000_to_2010_crosswalk.xls", clear ///
firstrow cellrange(A7:D868)

ren SOCcode soc_code2000
ren C soc_code2010

keep soc_code2000 soc_code2010

save "data/clean/soc_xwalk_2000.dta", replace

// now merging crosswalks so that each year's SOC codes are matched to O-Net

// 2018: no merging

use "data/clean/soc_xwalk_2018.dta", clear

preserve
gen socyear = 2018
ren soc_code2018 occsoc

tempfile soc2018
save `soc2018', replace
restore

// 2010: merge with 2018, keeping only observations in both data sets
joinby soc_code2018 using "data/clean/soc_xwalk_2010.dta"
keep soc_code2010 onet_code

preserve
gen socyear = 2010
ren soc_code2010 occsoc

tempfile soc2010
save `soc2010', replace
restore

// 2000: merge with 2010, keeping only observations in both data sets
joinby soc_code2010 using "data/clean/soc_xwalk_2000.dta"
keep soc_code2000 onet_code


gen socyear = 2000
ren soc_code2000 occsoc

// now append 2010 and 2018 crosswalks
append using `soc2010'
append using `soc2018'

save "data/clean/soc_xwalk_all.dta", replace

* import occupation math scores

import delimited "data/raw/Mathematics.csv", clear varnames(1)

* some occupations have "not available" level, replacing these with missing
* others have "not relevant" level. by inspection these seem to be the
* least mathematical jobs, so replacing these with zeros

replace level = "" if level=="Not available"
replace level = "0" if level=="Not relevant"

* destring "level" variable, replacing "not available" with
* missing values

destring level, replace force
corr importance level //0.94, using level for now

ren code onet_code

keep onet_code level
ren level math_level

* note below is 1:m as some SOC codes associated with multiple professions
* in math score data
merge 1:m onet_code using "data/clean/soc_xwalk_all.dta"
keep if _merge==3 //only keeping professions with Census code and importance
// scores; note that there is a significant number missing importance scores
drop _merge

// aggregating multiple O-Net occupations within a single Census code
// using simple mean

collapse (mean) math_level, by(occsoc socyear)

save "data/clean/math_level", replace